import pandas as pd
import geopandas as gpd
import numpy as np
import plotly
import plotly.express as px
pd.options.plotting.backend = "plotly" # making pandas.plot() use plotly rather than matplotlib
### GEOSPATIAL PACKAGES AND FUNCTIONS ###
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
# this helper function was taken from the Week 9 Tutorial
def create_wkt_element(geom, srid):
if geom.geom_type == 'Polygon':
geom = MultiPolygon([geom])
return WKTElement(geom.wkt, srid)
SA2_2016_AUST¶This is the definitive SA2 areas dataset sourced from the ABS. This dataset will accurately tell us which neighbourhoods are located in Greater Sydney, allowing us to only store these entries in our database.
regions = gpd.read_file("inputs/internal_datasets/SA2_2016_AUST/SA2_2016_AUST.shp")
greater_sydney = regions.copy()
greater_sydney = greater_sydney[greater_sydney["GCC_CODE16"] == "1GSYD"] # the Greater Capital City code for Greater Sydney
SA2_5DIG16 column.columns = ["SA2_5DIG16", "STE_CODE16", "STE_NAME16", "GCC_CODE16", "GCC_NAME16"]
greater_sydney.drop(columns = columns, inplace = True)
greater_sydney.rename(columns = {"SA2_MAIN16": "sa2_code", "SA2_NAME16": "sa2_name",
"SA3_CODE16": "sa3_code", "SA3_NAME16": "sa3_name",
"SA4_CODE16": "sa4_code", "SA4_NAME16": "sa4_name",
"AREASQKM16": "areasqkm"}, inplace = True)
greater_sydney.dtypes
sa2_code object sa2_name object sa3_code object sa3_name object sa4_code object sa4_name object areasqkm float64 geometry geometry dtype: object
pandas has parsed sa2_code, sa3_code, and sa4_code as objects. These variables should definitely be integers. Let's check for any strings in these columns which may have caused this issue.
cols_to_check = ["sa2_code", "sa3_code", "sa4_code"]
for col in cols_to_check:
if greater_sydney[greater_sydney[col].str.isnumeric() == False].empty:
print(f"No strings in column: {col}")
else:
print(f"{col} contains a string!")
No strings in column: sa2_code No strings in column: sa3_code No strings in column: sa4_code
All columns seem to be numeric (perhaps the error which caused the parsing was in the unfiltered dataset). Let's convert these columns now.
# converting columns to integer
for col in cols_to_check:
greater_sydney[col] = pd.to_numeric(greater_sydney[col])
Now we should convert the geometry column to WKT format, allowing PostGIS to push it to the database. First we check the SRID for the dataset.
with open("inputs/internal_datasets/SA2_2016_AUST/SA2_2016_AUST.prj") as f:
for line in f:
print(line)
GEOGCS["GCS_GDA_1994",DATUM["D_GDA_1994",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433],AUTHORITY["EPSG",4283]]
# from the .prj file "authority" field
srid_sa2 = 4283
greater_sydney["geom"] = greater_sydney["geometry"].apply(lambda x: create_wkt_element(geom = x, srid = srid_sa2))
greater_sydney.drop(columns = "geometry", inplace = True) # dropping the old `geometry` column
greater_sydney.dtypes
sa2_code int64 sa2_name object sa3_code int64 sa3_name object sa4_code int64 sa4_name object areasqkm float64 geom object dtype: object
Recall that we have filtered the entire SA2 regions dataset to only contain those SA2 regions in Greater Sydney. In other words, the sa2_code column now contains only the SA2 areas we want to focus on in this assignment; this list will be very useful in filtering out the Neighbourhoods.csv and BusinessStats.csv datasets, so we will store this in its own variable.
g_sydney_sa2_codes = greater_sydney["sa2_code"].to_list()
print(f"Number of SA2 regions in Greater Sydney: {len(g_sydney_sa2_codes)}")
Number of SA2 regions in Greater Sydney: 312
Hence Neighbourhoods.csv and BusinessStats.csv should eventually only have 312 entries.
Neighbourhoods.csv¶neighbourhoods = pd.read_csv("inputs/internal_datasets/Neighbourhoods.csv")
neighbourhoods.head(3)
| Unnamed: 0 | area_id | area_name | land_area | population | number_of_dwellings | number_of_businesses | median_annual_household_income | avg_monthly_rent | 0-4 | 5-9 | 10-14 | 15-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 102011028 | Avoca Beach - Copacabana | 643.8 | 7590 | 2325 | 738.0 | 46996.0 | 1906.0 | 467 | 583 | 604 | 560 |
| 1 | 1 | 102011029 | Box Head - MacMasters Beach | 3208.6 | 10986 | 3847 | 907.0 | 42621.0 | 1682.0 | 586 | 696 | 661 | 692 |
| 2 | 2 | 102011030 | Calga - Kulnura | 76795.1 | 4841 | 1575 | 1102.0 | 42105.0 | 1182.0 | 220 | 254 | 304 | 320 |
Some information easily identified from this dataframe:
number_of_dwellings and number_of_businesses columns as they are not related to our project.area_id should be a primary key, of type CHAR(9). This column is actually the SA2 code for the neighbourhood.area_name should be cast as a string. This column is actually the SA2 name for the neighbourhood.med_ann_house_income, and avg_monthly_rent should be cast as floats.young_pop column.# dropping columns
neighbourhoods.drop(columns = ["Unnamed: 0", "number_of_dwellings", "number_of_businesses", "land_area"], inplace = True)
Let's also rename these columns for clarity and ease of use.
columns_dict = {"median_annual_household_income": "med_ann_house_income",
"area_id": "sa2_code",
"area_name": "sa2_name"}
neighbourhoods.rename(columns = columns_dict, inplace = True)
neighbourhoods["young_pop"] = neighbourhoods["0-4"] + neighbourhoods["5-9"] + neighbourhoods["10-14"] + neighbourhoods["15-19"]
neighbourhoods.drop(columns = ["0-4", "5-9", "10-14", "15-19"], inplace = True)
neighbourhoods.dtypes
sa2_code int64 sa2_name object population object med_ann_house_income float64 avg_monthly_rent float64 young_pop int64 dtype: object
Interestingly, Pandas has automatically cast population as an object. Let's confirm whether the column contains characters other than numbers, and hence maybe why Pandas automatically converted these entries into strings.
neighbourhoods[neighbourhoods["population"].str.isnumeric() == False]["population"]
312 12,670 313 6,910 314 4,662 315 12,818 316 8,262 317 7,931 318 4,919 319 14,959 320 6,025 321 6,589 Name: population, dtype: object
It seems that in some entries, the population data contains a comma as a value separator! We should remove them.
neighbourhoods["population"].replace(',', '', regex = True, inplace = True)
neighbourhoods["population"] = pd.to_numeric(neighbourhoods["population"])
# the cleaned datatypes of the dataset
neighbourhoods.dtypes
sa2_code int64 sa2_name object population float64 med_ann_house_income float64 avg_monthly_rent float64 young_pop int64 dtype: object
Looking good (why population is a float is explained by the presence of a NaN value; see next section).
neighbourhoods.isna().sum()
sa2_code 0 sa2_name 0 population 1 med_ann_house_income 8 avg_monthly_rent 12 young_pop 0 dtype: int64
There are several entries with missing data in this dataset. Let's examine each one of these.
# printing the list of areas with unknown population
neighbourhoods[neighbourhoods["population"].isna()]["sa2_name"]
194 Holsworthy Military Area Name: sa2_name, dtype: object
# printing the list of areas with unknown average monthly rent
neighbourhoods[neighbourhoods["avg_monthly_rent"].isna()]["sa2_name"]
64 Prospect Reservoir 66 Banksmeadow 70 Port Botany Industrial 71 Sydney Airport 88 Centennial Park 194 Holsworthy Military Area 206 Blue Mountains - North 211 Blue Mountains - South 229 Rookwood Cemetery 246 Smithfield Industrial 247 Yennora Industrial 287 Wetherill Park Industrial Name: sa2_name, dtype: object
# printing the list of areas with unknown median household income
neighbourhoods[neighbourhoods["med_ann_house_income"].isna()]["sa2_name"]
70 Port Botany Industrial 71 Sydney Airport 88 Centennial Park 194 Holsworthy Military Area 206 Blue Mountains - North 211 Blue Mountains - South 229 Rookwood Cemetery 307 Royal National Park Name: sa2_name, dtype: object
These results seem to make sense — it is not that the dataset is incomplete, but simply that these areas would not hold a value for variables related to household income or rent because there are no households in these areas. These areas are either industrial areas or simply unresided land (e.g. Holsworthy Military Area or Prospect Reservoir). We will deal with these areas soon.
This also explains why pandas has stored population as a float — NaN values cannot be stored as integers.
Below is a simple algorithm which checks which SA2 codes are present in neighbourhoods but not present in greater_sydney. It then removes these entries from neighbourhoods.
print(f"Number of entries prior: {neighbourhoods.shape[0]}")
Number of entries prior: 322
neighbourhoods_sa2_codes = neighbourhoods["sa2_code"].to_list()
rural_neighbourhoods = []
print("Regions which are not in Greater Sydney:")
for neigh_code in neighbourhoods_sa2_codes:
if neigh_code not in g_sydney_sa2_codes:
rural_neighbourhoods.append(neigh_code)
print(neighbourhoods[neighbourhoods["sa2_code"] == neigh_code]["sa2_name"].to_string())
Regions which are not in Greater Sydney: 312 Goulburn Region 313 Bathurst Region 314 Oberon 315 Lithgow 316 Lithgow Region 317 Cessnock Region 318 Singleton Region 319 Morisset - Cooranbong 320 Hill Top - Colo Vale 321 Southern Highlands
We can see that indeed, the regions in neighbourhoods which are not in Greater Sydney are actually rural/remote areas.
# dropping these entries from the dataset
for code in rural_neighbourhoods:
neighbourhoods.drop(neighbourhoods[neighbourhoods["sa2_code"] == code].index, inplace = True)
print(f"Number of entries after: {neighbourhoods.shape[0]}")
Number of entries after: 312
BusinessStats.csv¶business = pd.read_csv("inputs/internal_datasets/BusinessStats.csv")
business.head(3)
| area_id | area_name | number_of_businesses | accommodation_and_food_services | retail_trade | agriculture_forestry_and_fishing | health_care_and_social_assistance | public_administration_and_safety | transport_postal_and_warehousing | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 101021007 | Braidwood | 629 | 26 | 27 | 280 | 11 | 0 | 35 |
| 1 | 101021008 | Karabar | 326 | 7 | 10 | 8 | 11 | 0 | 43 |
| 2 | 101021009 | Queanbeyan | 724 | 52 | 47 | 11 | 56 | 3 | 77 |
The columns of interest for us from this dataset are:
area_id (primary key)area_nameaccommodation_and_food_servicesretail_trade andhealth_care_and_social_assistanceThe other columns do not play a role in our liveability analysis, and hence we can remove them.
cols_to_drop = ["number_of_businesses", "agriculture_forestry_and_fishing", "public_administration_and_safety", "transport_postal_and_warehousing"]
business.drop(columns = cols_to_drop, inplace = True)
columns_dict = {"area_id": "sa2_code",
"area_name": "sa2_name",
"accommodation_and_food_services": "accom_food",
"retail_trade": "retail",
"health_care_and_social_assistance": "health"}
business.rename(columns = columns_dict, inplace = True)
business.dtypes
sa2_code int64 sa2_name object accom_food int64 retail int64 health int64 dtype: object
It seems that this dataset is much cleaner than Neighbourhoods.csv in terms of its data quality — all data types have been correctly identified.
business.isna().sum().sum()
0
There are no missing values in the entire dataset — how pleasant.
Once again, we only store those entries in business whose sa2_code exists in greater_sydney.
print(f"Number of entries prior: {business.shape[0]}")
Number of entries prior: 2301
business_sa2_codes = business["sa2_code"].to_list()
rural_business = []
for business_code in business_sa2_codes:
if business_code not in g_sydney_sa2_codes:
rural_business.append(business_code)
# dropping these entries from the dataset
for code in rural_business:
business.drop(business[business["sa2_code"] == code].index, inplace = True)
print(f"Number of entries after: {business.shape[0]}")
Number of entries after: 312
neighbourhoods and business datasets¶neighbourhoods provides us information on populations, average rent, median income and areas of each SA2 area.business provides us information on the number of hospitality, retail, and health businesses which exist in each SA2 area.As these two datasets are both providing statistical/numerical information regarding the same SA2 areas it makes sense to combine these into one unified dataset. Note that we are not merging greater_sydney into this unified dataset as it contains geospatial data. It forms a better and more logical database schema to keep numerical and geospatial data separated.
neighbourhood_stats = business.merge(neighbourhoods, how = "inner")
# reordering columns
neighbourhood_stats = neighbourhood_stats[["sa2_code", "sa2_name", "population",
"young_pop", "med_ann_house_income",
"avg_monthly_rent", "accom_food",
"retail", "health"]]
print(f"Number of entries after merge: {business.shape[0]}") # should be 312
Number of entries after merge: 312
neighbourhood_stats.head(3)
| sa2_code | sa2_name | population | young_pop | med_ann_house_income | avg_monthly_rent | accom_food | retail | health | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 102011028 | Avoca Beach - Copacabana | 7590.0 | 2214 | 46996.0 | 1906.0 | 33 | 35 | 60 |
| 1 | 102011029 | Box Head - MacMasters Beach | 10986.0 | 2635 | 42621.0 | 1682.0 | 23 | 45 | 43 |
| 2 | 102011030 | Calga - Kulnura | 4841.0 | 1098 | 42105.0 | 1182.0 | 14 | 43 | 12 |
Recall that while cleaning the Neighbourhoods.csv dataset, we stumbled upon a few areas in Greater Sydney with null values, and we reasoned that these null values exist because the areas themselves were unresided. Let's take another look at these areas.
# printing all neighbourhoods where the population is less than 1000 or young_pop is less than 100
# this includes neighbourhoods with any NA values in their entry
empty_neighbourhoods = neighbourhood_stats[(neighbourhood_stats["population"] < 1000) | (neighbourhood_stats["young_pop"] < 100)]["sa2_name"].to_list()
for i in empty_neighbourhoods:
print(i)
Prospect Reservoir Banksmeadow Port Botany Industrial Sydney Airport Centennial Park Holsworthy Military Area Blue Mountains - North Blue Mountains - South Rookwood Cemetery Smithfield Industrial Yennora Industrial Badgerys Creek Wetherill Park Industrial Royal National Park
Also, given that Badgery's Creek is now under construction to be the home of Sydney's second airport, with schools and churches being moved and residents moving out (Badgerys Creek: Suburb becomes deserted as airport construction looms), we believe it does not really make sense to consider this neighbourhood in a liveability analysis.
As this assignment is about liveability, we believe it is justified to convert the descriptive data in all the above suburbs to null. This would ensure that
Let's do this now.
cols_to_na = ["population", "young_pop", "med_ann_house_income", "avg_monthly_rent", "accom_food", "retail", "health"]
for neighbourhood in empty_neighbourhoods:
for col in cols_to_na:
neighbourhood_stats.loc[neighbourhood_stats["sa2_name"] == neighbourhood, col] = np.nan
school_catchments¶This dataset contains three separate shape files:
catchments_future.shpcatchments_primary.shp andcatchments_secondary.shpLet's import these three files.
directory = "inputs/internal_datasets/school_catchments/"
future_catch = gpd.read_file(directory + "catchments_future.shp")
primary_catch = gpd.read_file(directory + "catchments_primary.shp")
secondary_catch = gpd.read_file(directory + "catchments_secondary.shp")
future_catch.head(1)
| USE_ID | CATCH_TYPE | USE_DESC | ADD_DATE | KINDERGART | YEAR1 | YEAR2 | YEAR3 | YEAR4 | YEAR5 | YEAR6 | YEAR7 | YEAR8 | YEAR9 | YEAR10 | YEAR11 | YEAR12 | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2133 | PRIMARY | Harbord PS | 20200720 | 2023 | 2023 | 2023 | 2023 | 2023 | 2023 | 2023 | 0 | 0 | 0 | 0 | 0 | 0 | POLYGON ((151.29769 -33.76832, 151.29731 -33.7... |
primary_catch.head(1)
| USE_ID | CATCH_TYPE | USE_DESC | ADD_DATE | KINDERGART | YEAR1 | YEAR2 | YEAR3 | YEAR4 | YEAR5 | YEAR6 | YEAR7 | YEAR8 | YEAR9 | YEAR10 | YEAR11 | YEAR12 | PRIORITY | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2838 | PRIMARY | Parklea PS | 20181210 | Y | Y | Y | Y | Y | Y | Y | N | N | N | N | N | N | None | POLYGON ((150.93564 -33.71612, 150.93715 -33.7... |
secondary_catch.head(1)
| USE_ID | CATCH_TYPE | USE_DESC | ADD_DATE | KINDERGART | YEAR1 | YEAR2 | YEAR3 | YEAR4 | YEAR5 | YEAR6 | YEAR7 | YEAR8 | YEAR9 | YEAR10 | YEAR11 | YEAR12 | PRIORITY | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8503 | HIGH_COED | Billabong HS | 20200507 | N | N | N | N | N | N | N | Y | Y | Y | Y | Y | Y | None | POLYGON ((146.67182 -35.31444, 146.68930 -35.3... |
Immediately we can comment on some data quality and integration issues in the datasets:
USE_ID will be the primary key for all three datasets, and should be an integer.KINDERGART to YEAR12 are not required and can be removed.ADD_DATE column for all three datasets is actually a poorly-formatted date rather than an integer (which is what Pandas would have interpreted it as). Normally we would fix this, however as this column is not strictly needed for our analysis, it will be easier to just drop it from the datasets.PRIORITY column for primary_catch and secondary_catch is not needed for our analysis and can be removed; this column is likely for those schools which are major public infrastructure projects and thus garner a lot of media attention. We can see this as the only school listed as high priority is the "Inner Sydney High School" — a first-of-its-kind highrise high school.GEOMETRY column for all three datasets needs to be converted into the Well-Known Text (WKT) format to allow it to be ingested by the PostGres database.# dropping columns
datasets = [future_catch, primary_catch, secondary_catch]
columns = ["ADD_DATE", "PRIORITY", "KINDERGART", "YEAR1", "YEAR2", "YEAR3", "YEAR4", "YEAR5", "YEAR6",
"YEAR7", "YEAR8", "YEAR9", "YEAR10", "YEAR11", "YEAR12"]
for df in datasets:
for col in columns:
if col in df.columns:
df.drop(columns = col, inplace = True)
# renaming columns
new_col_names = {"USE_ID": "school_id", "CATCH_TYPE": "catch_type", "USE_DESC": "name"}
for df in datasets:
df.rename(columns = new_col_names, inplace = True)
# all three datasets:
# converting `USE_ID` to integer
for df in datasets:
df["school_id"] = pd.to_numeric(df["school_id"])
Now, in order to convert geometry to WKT format, we first need to examine the Spatial Reference Identifier (SRID) for the dataset. This metadata can be found in the .prj file for each shape file.
with open("inputs/internal_datasets/school_catchments/catchments_secondary.prj", "r") as f:
for line in f:
print(line)
GEOGCS["GCS_GDA_1994",DATUM["D_GDA_1994",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433],AUTHORITY["EPSG",4283]]
We can see the AUTHORITY label tells us the EPSG (worldwide coordinate system) is 4283. This is the same for all three of our datasets. Hence using this, we can convert the column to WKT format.
# all three datasets:
# converting geometry into WKT format
srid_school = 4283
for df in datasets:
df["geom"] = df["geometry"].apply(lambda x: create_wkt_element(geom = x, srid = srid_school))
df.drop(columns = "geometry", inplace = True) # dropping the old `geometry` column
Let's check if all our datatypes have been correctly interpreted.
dtypes_df = pd.DataFrame({"column": future_catch.columns,
"future_catch_dtype": future_catch.dtypes.to_list(),
"primary_catch_dtype": primary_catch.dtypes.to_list(),
"secondary_catch_dtype": secondary_catch.dtypes.to_list()})
dtypes_df
| column | future_catch_dtype | primary_catch_dtype | secondary_catch_dtype | |
|---|---|---|---|---|
| 0 | school_id | int64 | int64 | int64 |
| 1 | catch_type | object | object | object |
| 2 | name | object | object | object |
| 3 | geom | object | object | object |
catch_type and name are objects are they are strings. geom is an object as its now Well-Known Text.
Looks pretty good!
if future_catch[future_catch.duplicated()].empty:
print("future_catch has no duplicates.")
else:
print("Duplicates found!")
future_catch has no duplicates.
if primary_catch[primary_catch.duplicated()].empty:
print("primary_catch has no duplicates.")
else:
print("Duplicates found!")
primary_catch has no duplicates.
if secondary_catch[secondary_catch.duplicated()].empty:
print("secondary_catch has no duplicates.")
else:
print("Duplicates found!")
secondary_catch has no duplicates.
There does not seem to be any duplicate entries within each of these datasets.
Eventually we want to have one school_catchments dataset which combines the data from all 3 of our current datasets. We will achieve this by concatenating one dataset to the end of another. However, before we conduct this operation, we should check whether there are any entries which are in two or more datasets. We will do this using an inner merge.
f_p = future_catch.merge(primary_catch, how = "inner", on = "school_id")
f_s = future_catch.merge(secondary_catch, how = "inner", on = "school_id")
p_s = primary_catch.merge(secondary_catch, how = "inner", on = "school_id")
f_p_s = f_p.merge(secondary_catch, how = "inner", on = "school_id")
f_p.shape
(20, 7)
f_s.shape
(20, 7)
p_s.shape
(63, 7)
f_p_s.shape
(1, 10)
What interesting results. Let's examine these one-by-one.
primary_catch and secondary_catch contain 20 entries each from future_catch.¶These 40 entries are not unique — from our final result above we know that 1 entry exists in all 3 datasets. This tells us that even though there are 20 duplicated entries in both primary_catch and secondary_catch, 1 of these entries is common, and so in total there are 39 entries in future_catch which are present in the other two datasets. Keep in mind that the length of future_catch is only 44, so only 5 entries have not been duplicated!
We will need to remove these 39 entries from future_catch to solve this issue (these entries will still exist in primary_catch and secondary_catch so we are not losing any information). At the end of this process, future_catch should only have 5 entries left.
As for a reason to this overlap, we believe the future_catch dataset not only includes brand new schools which are taking in their first cohorts in the upcoming year, but also any school which is changing their cohort numbers. For example, a school which is increasing their grade size from 150 students a year to 180 students a year would be present in this dataset as well as either primary_catch or secondary_catch.
# creating a list with all the id's of duplicated schools
all_f_dups = pd.concat([f_p, f_s])
unique_f_dups = list(all_f_dups["school_id"].unique())
# creating a copy of future_catch
# f_catch will be our cleaned copy of the dataset
f_catch = future_catch.copy()
# dropping all entries in f_catch with school_id in duplicates list
for id in unique_f_dups:
f_catch.drop(f_catch[f_catch["school_id"] == id].index, inplace = True)
f_catch.shape
(5, 4)
Success!
primary_catch and secondary_catch.¶Let's examine which schools these are.
p_s["catch_type_x"].unique()
array(['CENTRAL_PRIMARY'], dtype=object)
p_s["catch_type_y"].unique()
array(['CENTRAL_HIGH'], dtype=object)
It appears that all schools which are both primary and secondary schools come under the CENTRAL school type. According to this NSW Department of Education (DoE) glossary page, a "central school" is
A school containing both primary and secondary departments...characteristic of regional districts where the population is too small to support a single high school.
We could assume that all the central schools in our dataset are in regional areas, but there is the possibility of losing some information in the chance that a central school is present in Greater Sydney. So how do we check that all these central schools which are showing up in both our primary_catch and secondary_catch datasets are indeed rural?
The NSW DoE provides an up-to-date list of all rural schools in NSW in the form of a HTML table. We can scrape this page, filter to find the list of "central" schools, and then cross-reference this with our list of duplicates.
import requests
from bs4 import BeautifulSoup
webpage_source = requests.get("https://education.nsw.gov.au/public-schools/selective-high-schools-and-opportunity-classes/year-7/information-for-applicants/aurora-college/rural-and-remote-high-schools").text
content = BeautifulSoup(webpage_source, 'html5lib')
# the following code conducts the web scraping and stores all school names in a list
tables = content.find_all('div', {"class": "script aem-GridColumn aem-GridColumn--default--12"})
rural_schools = []
for table in tables:
body = table.find("tbody")
trs = body.find_all("tr")
for tr in trs:
tds = tr.find("td")
rural_schools.append(tds.text.strip())
rural_schools = rural_schools[2:] # removing two erroneous entries
Before we can compare the schools in our dataframe of duplicates p_s and our list of rural schools rural_schools, we face one more issue. The school names in p_s are in abbreviated form e.g. "Central School" = "CS" or "Technical High School" = "THS" whereas the school names in rural_schools are in full form. Let's just keep the first word from each entry in both lists and use this to compare between them (usually this word represents the suburb where the school is located).
rural_schools_suburb = []
for school in rural_schools:
suburb = school.split(" ")[0]
rural_schools_suburb.append(suburb)
dups_p_s = sorted(list(p_s["name_x"])) # list of all school names in the dataframe of duplicates
dups_p_s_suburb = []
for school in dups_p_s:
suburb = school.split(" ")[0]
dups_p_s_suburb.append(suburb)
Now it is time to compare. For each school in our dataframe of duplicates, we will check if it is rural or not by checking whether it is present in our list of rural schools. If it isn't rural, we will keep it in our dataset (as it is part of Greater Sydney).
rural_dups = []
non_rural_dups = []
print("List of non-rural schools present in both our primary and secondary datasets:")
for i in range(len(dups_p_s_suburb)):
school = dups_p_s_suburb[i]
if not school in rural_schools_suburb:
non_rural_dups.append(dups_p_s[i])
print(dups_p_s[i])
else:
rural_dups.append(dups_p_s[i])
List of non-rural schools present in both our primary and secondary datasets: Alexandria Park CS Lindfield LV Lucas Hts CS Wadalba CS
Our results seem to be correct aside from Wadalba CS which is a school in the Central Coast. We believe this is a semantic data quality issue — the dataset on the DoE website clearly considers the Central Coast not to be "remote" or "rural" and hence hasn't listed it in their tables, but in our assignment the definition of "remote" or "rural" is anything outside of Greater Sydney. Hence we will not include Wadalbda CS in our dataset.
rural_dups.append("Wadalba CS")
Finally, let's remove all the entries in primary_catch and secondary_catch which are considered rural.
p_catch = primary_catch.copy()
s_catch = secondary_catch.copy()
for school in rural_dups:
p_catch.drop(p_catch[p_catch["name"] == school].index, inplace = True)
s_catch.drop(p_catch[p_catch["name"] == school].index, inplace = True)
Now the only duplicates left in our datasets are Alexandria Park CS, Lindfield LV, and Lucas Hts CS. We must remove these from either the primary_catch or secondary_catch datasets — but ultimately this does not matter as both datasets will be concatenated together in the end. Here we drop them from the primary schools dataset.
for school in non_rural_dups:
p_catch.drop(p_catch[p_catch["name"] == school].index, inplace = True)
if p_catch.merge(s_catch, how = "inner", on = "school_id").empty:
print("There are no more duplicate entries across the primary and secondary datasets.")
else:
print("Duplicates found!")
There are no more duplicate entries across the primary and secondary datasets.
We can now, finally, append each dataset onto each other to create our final dataset school_catchments.
school_catchments = pd.concat([f_catch, p_catch, s_catch])
school_catchments.head(3)
| school_id | catch_type | name | geom | |
|---|---|---|---|---|
| 2 | 4683 | PRIMARY | Murrumbateman PS | MULTIPOLYGON (((149.10556911272894 -34.8952307... |
| 30 | 4678 | PRIMARY | Epping South PS | MULTIPOLYGON (((151.07650930698497 -33.7768606... |
| 39 | 4680 | PRIMARY | Googong PS | MULTIPOLYGON (((149.26653458527704 -35.4983768... |
break_and_enter¶break_enter = gpd.read_file("inputs/internal_datasets/break_and_enter/BreakEnterDwelling_JanToDec2021.shp")
break_enter.head(3)
| OBJECTID | Contour | Density | ORIG_FID | Shape_Leng | Shape_Area | geometry | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 8.0 | Low Density | 1 | 0.012138 | 0.000006 | POLYGON ((149.91078 -37.06636, 149.91080 -37.0... |
| 1 | 2 | 8.0 | Low Density | 1 | 0.019106 | 0.000015 | POLYGON ((149.90601 -37.05837, 149.90602 -37.0... |
| 2 | 3 | 8.0 | Low Density | 1 | 0.006068 | 0.000002 | POLYGON ((148.94250 -37.04209, 148.94253 -37.0... |
Notes on the dataset:
Contour and ORIG_FID columns are not related to our investigation, and so we can remove these.Shape_Leng and Shape_Area ourselves once the dataset is uploaded to the PostGIS database, and so we can remove these columns as well.OBJECTID will be the primary key.columns = ["Contour", "ORIG_FID", "Shape_Leng", "Shape_Area"]
break_enter.drop(columns = columns, inplace = True)
break_enter.rename(columns = {"OBJECTID": "id", "Density": "density"}, inplace = True)
After checking the SRID, we convert the geometry column into WKT format.
with open("inputs/internal_datasets/break_and_enter/BreakEnterDwelling_JanToDec2021.prj") as f:
for line in f:
print(line)
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
# from the .prj file
srid_bne = 4283
break_enter["geom"] = break_enter["geometry"].apply(lambda x: create_wkt_element(geom = x, srid = srid_bne))
break_enter.drop(columns = "geometry", inplace = True) # dropping the old `geometry` column
break_enter.dtypes
id int64 density object geom object dtype: object
Data types look correct.
break_enter.duplicated().sum()
0
No duplicate rows exist.
break_enter.isna().sum()
id 0 density 0 geom 0 dtype: int64
It seems like the dataset is fairly clean and is ready to be pushed to the database.
trees.csv¶The following dataset Street and park trees was taken from the City of Sydney Open Data Hub. The dataset has a custom Open Data licence, which allows us to use it for the purposes of this assignment.
The dataset covers all the park and street trees on the City of Sydney asset register. It is regularly updated: as of writing, it was last updated on 15th May 2022.
trees = pd.read_csv("inputs/external_datasets/trees.csv")
trees.head(3)
| OID_ | asset_id | TreeType | SpeciesName | CommonName | TreeHeight | TreeCanopyEW | TreeCanopyNS | Tree_Status | Longitude | Latitude | ObjectId | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 301 | TP10066 | Park Tree | Murraya paniculata | Orange Jessamine | 2.0 | 2.0 | 3.0 | Tree | 151.183995 | -33.880358 | 1 |
| 1 | 1 | TP00005 | Park Tree | Caesalpinia ferrea | Leopard Tree | 8.0 | 3.0 | 3.0 | Tree | 151.181973 | -33.895638 | 2 |
| 2 | 302 | TP10068 | Park Tree | Murraya paniculata | Orange Jessamine | 2.0 | 2.0 | 3.0 | Tree | 151.183966 | -33.880373 | 3 |
Taking an inital look at the dataset, we note:
Longitude and Latitude seem unrelated to our present investigation and will be removed.Let's check if there are any null values in the dataset.
trees.isna().sum()
OID_ 0 asset_id 154 TreeType 0 SpeciesName 0 CommonName 0 TreeHeight 0 TreeCanopyEW 0 TreeCanopyNS 0 Tree_Status 0 Longitude 0 Latitude 0 ObjectId 0 dtype: int64
asset_id has 154 null values. This would definitely not be a good choice for our primary key! Now we just need to decide between OID_ and ObjectID.
Digging deeper into the metadata of the dataset, we see that the column OID_ has its nullable parameter set to true whereas ObjectId has it set to false. This is a strong indication that ObjectId was probably the designated primary key for the dataset.
Hence we will remove the OID_ and asset_id columns and retain ObjectId as our primary key.
columns = ["OID_", "asset_id", "TreeType", "SpeciesName", "CommonName", "TreeHeight", "TreeCanopyEW", "TreeCanopyNS", "Tree_Status"]
trees.drop(columns = columns, inplace = True)
trees.rename(columns = {"ObjectId": "id"}, inplace = True)
trees.head(3)
| Longitude | Latitude | id | |
|---|---|---|---|
| 0 | 151.183995 | -33.880358 | 1 |
| 1 | 151.181973 | -33.895638 | 2 |
| 2 | 151.183966 | -33.880373 | 3 |
We need to store latitude and longitude as a geometric point. We can do this using geopandas.
trees['geom'] = gpd.points_from_xy(trees.Longitude, trees.Latitude)
trees.drop(columns = ["Longitude", 'Latitude'], inplace = True) # removing the old lat/long columns
Now we will convert these points into WKT format, allowing it to be ingested into PostGIS.
srid_trees = 4326
trees['geom'] = trees['geom'].apply(lambda x: WKTElement(x.wkt, srid = srid_trees))
The dataset is ready to be pushed to the database.
electricity.geojson¶The following dataset Electricity intensity by suburb was taken from the City of Sydney Open Data Hub. The dataset is licensed under Creative Commons Attribution 4.0 International, which allows us to share and adapt the data for the purposes of this assignment.
The dataset describes the electricity intensity data (kWh/m2) by suburb within the City of Sydney local government area. Data ranges from 2005/06 financial year to the latest financial year, and is derived from utility data sets.
We will use geopandas to read the file the .geojson file in.
electricity = gpd.read_file("inputs/external_datasets/electricity.geojson")
electricity.head(3)
| FID | NAME | Shape_Leng | Shape_Area | F2005_06 | F2006_07 | F2007_08 | F2008_09 | F2009_10 | F2010_11 | ... | F2012_13 | F2013_14 | F2014_15 | F2015_16 | F2016_17 | F2017_18 | F2018_19 | Shape__Area | Shape__Length | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Alexandria | 10168.649178 | 3.523771e+06 | 61.254607 | 61.236810 | 61.330473 | 55.771717 | 52.208182 | 54.831454 | ... | 51.700183 | 47.993058 | 48.153169 | 47.530817 | 47.151695 | 46.551639 | 44.860637 | 5.129712e+06 | 12267.960868 | POLYGON ((151.19913 -33.89517, 151.19913 -33.8... |
| 1 | 2 | Forest Lodge + Annandale | 8654.226944 | 5.457704e+05 | 35.363439 | 35.390011 | 35.934231 | 35.533862 | 37.157023 | 39.169671 | ... | 44.322848 | 43.834806 | 41.674064 | 40.018691 | 47.415063 | 46.953099 | 45.383880 | 7.939479e+05 | 6817.416736 | POLYGON ((151.17727 -33.87272, 151.17728 -33.8... |
| 2 | 3 | Millers Point + Barangaroo | 3944.508809 | 4.634789e+05 | 65.724806 | 65.656440 | 74.529211 | 80.751317 | 87.994537 | 93.138544 | ... | 69.307108 | 49.608122 | 52.891155 | 37.825022 | 33.420168 | 32.414651 | 32.968087 | 6.739312e+05 | 4760.147743 | POLYGON ((151.20467 -33.85663, 151.20468 -33.8... |
3 rows × 21 columns
A few things we note about the dataset:
Shape_LengShape_AreaShape__LengthShape__Areageometry column needs to be converted into the Well-Known Text (WKT) format to allow it to be ingested by the PostGres database.# removing unnecessary columns
electricity.drop(columns = ["FID", "Shape_Leng", "Shape_Area", "Shape__Length", "Shape__Area"], inplace = True)
electricity.rename(columns = {"NAME": 'suburb'}, inplace = True)
Let's convert the geometry column into WKT format.
srid_elec = 4326
electricity["geom"] = electricity["geometry"].apply(lambda x: create_wkt_element(geom = x, srid = srid_elec))
electricity.drop(columns = "geometry", inplace = True) # dropping the old `geometry` column
electricity.dtypes
suburb object F2005_06 float64 F2006_07 float64 F2007_08 float64 F2008_09 float64 F2009_10 float64 F2010_11 float64 F2011_12 float64 F2012_13 float64 F2013_14 float64 F2014_15 float64 F2015_16 float64 F2016_17 float64 F2017_18 float64 F2018_19 float64 geom object dtype: object
It seems all columns have been interpreted correctly.
Let's examine how electricity consumption in City of Sydney suburbs has changed over time.
consumption_plot = electricity.copy()
consumption_plot = consumption_plot.loc[:, consumption_plot.columns != 'geom']
# prepping the dataset for plotting
consumption_plot.set_index("suburb", inplace = True)
consumption_plot = consumption_plot.transpose()
# plotting using plotly.express
fig = px.line(consumption_plot,
title = "Electricity consumption of City of Sydney suburbs over the years",
labels = {
"suburb": "Suburb",
"index": "Year",
"value": "Intensity (kWh/m2)"
})
fig.show()
We can see that for most suburbs, electricity consumption has been following a relatively stable downwards trend over the past two decades. There are of course some outliers; Barangaroo obviously had a very fluctuating electricity usage considering most of it had been under construction across the 2010's. We also don't have very up-to-date data — it would have been interesting to see how COVID-19 affected electricity consumption trends.
Given that there are only a few outlier suburbs, the data seems suitable to be averaged across all years.
# calculating mean of electricity usage over the years
elec_avg = electricity.copy()
elec_avg["usage"] = elec_avg.iloc[:, 1:-1].mean(axis = 1)
# deleting old year-by-year columns
elec_avg.drop(columns = list(elec_avg.iloc[:, 1:-2].columns), inplace = True)
elec_avg.head(3)
| suburb | geom | usage | |
|---|---|---|---|
| 0 | Alexandria | MULTIPOLYGON (((151.199134423001 -33.895167371... | 52.539398 |
| 1 | Forest Lodge + Annandale | MULTIPOLYGON (((151.177265964327 -33.872723472... | 40.651961 |
| 2 | Millers Point + Barangaroo | MULTIPOLYGON (((151.204668817113 -33.856632689... | 62.280418 |
Our dataset is fully cleaned and is ready to be pushed to the PostGres database.
### the following helper functions were taken from DATA2001 Week 4 Tutorial ###
from sqlalchemy import create_engine, inspect
import psycopg2
import psycopg2.extras
import json
credentials = "inputs/Credentials.json"
def pgconnect(credential_filepath, db_schema="public"):
with open(credential_filepath) as f:
db_conn_dict = json.load(f)
host = db_conn_dict['host']
db_user = db_conn_dict['user']
db_pw = db_conn_dict['password']
default_db = db_conn_dict['user']
try:
db = create_engine('postgresql+psycopg2://'+db_user+':'+db_pw+'@'+host+'/'+default_db, echo=False)
conn = db.connect()
print('Connected successfully.')
except Exception as e:
print("Unable to connect to the database.")
print(e)
db, conn = None, None
return db, conn
def query(conn, sqlcmd, args=None, df=True):
result = pd.DataFrame() if df else None
try:
if df:
result = pd.read_sql_query(sqlcmd, conn, params=args)
else:
result = conn.execute(sqlcmd, args).fetchall()
result = result[0] if len(result) == 1 else result
except Exception as e:
print("Error encountered: ", e, sep='\n')
return result
db, conn = pgconnect(credentials)
Connected successfully.
We are going to create our tables in the public schema. We should set the search_path so we do not have to keep on referring to the schema name in our queries.
conn.execute("SET search_path TO public")
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fb8ca150610>
greatersydney¶As we will be performing multiple spatial joins on the greatersydney.geom column, it is worth creating an index on this column.
conn.execute('''
DROP TABLE IF EXISTS greatersydney CASCADE;
CREATE TABLE greatersydney (
sa2_code INT PRIMARY KEY,
sa2_name VARCHAR(50) NOT NULL,
sa3_code INT NOT NULL,
sa3_name VARCHAR(50) NOT NULL,
sa4_code INT NOT NULL,
sa4_name VARCHAR(50) NOT NULL,
areasqkm FLOAT,
geom GEOMETRY(MULTIPOLYGON, 4283)
);
''')
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fb8ca150be0>
greater_sydney.to_sql("greatersydney",
con = conn,
if_exists = "append",
index = False,
dtype = {"geom": Geometry("MULTIPOLYGON", srid_sa2)})
312
conn.execute('''
CREATE INDEX IF NOT EXISTS gsydney_geom_idx
ON greatersydney
USING GIST (geom);
''')
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fb8ca15ba60>
neighbourhoods¶conn.execute('''
DROP TABLE IF EXISTS neighbourhoods CASCADE;
CREATE TABLE neighbourhoods (
sa2_code INT PRIMARY KEY,
sa2_name VARCHAR(50) NOT NULL,
population INT,
young_pop INT,
med_ann_house_income FLOAT,
avg_monthly_rent FLOAT,
accom_food INT,
retail INT,
health INT
);
''')
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fb91b040eb0>
neighbourhood_stats.to_sql("neighbourhoods",
con = conn,
if_exists = "replace",
index = False)
312
schools¶conn.execute('''
DROP TABLE IF EXISTS schools CASCADE;
CREATE TABLE schools (
school_id INT PRIMARY KEY,
catch_type VARCHAR(50) NOT NULL,
name VARCHAR(50) NOT NULL,
geom GEOMETRY(MULTIPOLYGON, 4283)
);
''')
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fb8d0047eb0>
school_catchments.to_sql("schools",
con = conn,
if_exists = "append",
index = False,
dtype = {"geom": Geometry("MULTIPOLYGON", srid_school)})
43
crime¶conn.execute('''
DROP TABLE IF EXISTS crime CASCADE;
CREATE TABLE crime (
id INT PRIMARY KEY,
density VARCHAR(50) NOT NULL,
geom GEOMETRY(MULTIPOLYGON, 4283)
);
''')
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fb8d001e400>
break_enter.to_sql("crime",
con = conn,
if_exists = "append",
index = False,
dtype = {"geom": Geometry("MULTIPOLYGON", srid_bne)})
594
electricity¶conn.execute('''
DROP TABLE IF EXISTS electricity CASCADE;
CREATE TABLE electricity (
suburb VARCHAR(50) PRIMARY KEY,
usage FLOAT,
geom GEOMETRY(MULTIPOLYGON, 4326)
);
''')
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fb8d0052e20>
elec_avg.to_sql("electricity",
con = conn,
if_exists = "append",
index = False,
dtype = {"geom": Geometry("MULTIPOLYGON", srid_elec)})
29
trees¶We will create an index on the trees.geom column for similar reasons as greatersydney.
conn.execute('''
DROP TABLE IF EXISTS trees CASCADE;
CREATE TABLE trees (
id INTEGER PRIMARY KEY,
geom GEOMETRY(POINT, 4326)
);
CREATE INDEX IF NOT EXISTS trees_geom_idx
ON trees
USING GIST
(ST_Transform(geom, 4283));
''')
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fb91921f8b0>
trees.to_sql("trees",
con = conn,
if_exists = "append",
index = False,
dtype = {"geom": Geometry("POINT", srid_trees)})
155
We first join the crime and the greatersydney tables. This joined data set informs us of the SA2 regions of the density hotspots, which is useful for the calculation of crime hotspot areas for each neighbourhood.
Note: many hotspots were not fully contained within a SA2 Region but were rather part of many regions. Such hotspots were divided into smaller hotspots which are contained within one region. Hence, upon using the ST_Intersects method below, multiple entries seemed to share the same id due to a singular hotspot being dissected across multiple SA2 regions.
This is not ideal as id is meant to be unique per entry. To overcome this, we create a new id using the ROW_NUMBER() function.
sql = """
DROP VIEW IF EXISTS crimesydney CASCADE;
CREATE VIEW crimesydney AS (
SELECT ROW_NUMBER() OVER (PARTITION BY C.density ORDER BY C.id ASC) AS id,
GS.sa2_code,
GS.sa2_name,
GS.areasqkm,
C.density,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
);
SELECT * FROM crimesydney
"""
crime_sydney = gpd.read_postgis(sql, conn)
crime_sydney.head(3)
| id | sa2_code | sa2_name | areasqkm | density | geom | area | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 123021436 | Bradbury - Wedderburn | 36.6555 | High Density | POLYGON ((150.82328 -34.08944, 150.82330 -34.0... | 2.911138e-05 |
| 1 | 2 | 123021444 | Rosemeadow - Glen Alpine | 48.0827 | High Density | POLYGON ((150.79940 -34.08926, 150.79942 -34.0... | 2.544203e-05 |
| 2 | 3 | 123021436 | Bradbury - Wedderburn | 36.6555 | High Density | POLYGON ((150.80612 -34.08702, 150.80621 -34.0... | 1.932130e-07 |
len(list(crime_sydney["sa2_name"].unique()))
284
Note that it appears only 284 out of 312 SA2 regions have recorded hotspots.
The following section contains a multitude of SQL queries on the crime dataset to understand how the dataset works in terms of low, medium and high densities.
Since the three types of density hotspots can be overlayed, certain hotspots can contain the area of other hotspots. Hence, as we do not wish to double up on our calculated areas, we must see what contours are contained which other contours.
Note: the below queries look long. In reality, they simply use window functions to compute the same spatial join between the greatersydney and crime dataset for different densities of crime contours. The results from these window functions are then inter-joined. This was preferred over storing the window functions as views or tables.
Let's check if low density hotspots can exist within a medium density hotspot.
sql = """
WITH lowcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'Low Density'
), medcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'Medium Density'
)
SELECT L.id AS id_low,
L.geom AS geom_low,
L.area AS area_low,
M.id AS id_med,
M.geom AS geom_med,
M.area AS area_med
FROM medcrime M JOIN lowcrime L ON ST_Contains(M.geom, L.geom)
"""
med_low_overlap = query(conn, sql)
med_low_overlap.head()
| id_low | geom_low | area_low | id_med | geom_med | area_med | |
|---|---|---|---|---|---|---|
| 0 | 313 | 0103000020BB100000010000003F010000B408A2474CE6... | 0.000211 | 277 | 0103000020BB100000010000003F010000B408A2474CE6... | 0.000211 |
| 1 | 291 | 0103000020BB10000001000000A20000007CFD266DAAE6... | 0.000128 | 278 | 0103000020BB10000001000000A20000007CFD266DAAE6... | 0.000128 |
Interestingly there are two instances of low density hotspots within medium densities. Upon closer inspection both seem to be the same hotspots in terms of their geom objects, with the only difference being their densities. Let's check this.
if med_low_overlap["geom_low"].equals(med_low_overlap["geom_med"]):
print("These geometries are equal.")
else:
print("These are unequal geometries!")
These geometries are equal.
The above output shows that both densities are the same — one does not contain the other. This means that their areas will be equal and deduction of inner (contained) areas is not necessary for these special hotspots.
# storing the outlier hotspots for later use
outlier_med = med_low_overlap["id_med"].tolist()
outlier_low = med_low_overlap["id_low"].tolist()
Let's check if low density hotspots can exist within a high density hotspot.
sql = """
WITH lowcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'Low Density'
), highcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'High Density'
)
SELECT L.id AS id_low,
L.geom AS geom_low,
L.area AS area_low,
H.id AS id_high,
H.geom AS geom_high,
H.area AS area_high
FROM highcrime H JOIN lowcrime L ON ST_Contains(H.geom, L.geom)
"""
high_low_overlap = query(conn, sql)
high_low_overlap.head()
| id_low | geom_low | area_low | id_high | geom_high | area_high |
|---|
With the exception of the 'outlier' case where the medium density was identical to the low density (not contained) the above output has proven that low densities hotspots cannot be contained within other hotspots.
Next, in order to calculate the areas of low density hotspots, the possibility of other hotspots existing within the low density hotspots needs to considered so that deduction of overlapping areas can be performed where necessary.
Let's check if medium density hotspots can exist within a low density hotspot.
sql = """
WITH lowcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'Low Density'
), medcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'Medium Density'
)
SELECT L.id AS id_low,
L.geom AS geom_low,
L.area AS area_low,
M.id AS id_med,
M.geom AS geom_med,
M.area AS area_med
FROM lowcrime L JOIN medcrime M ON ST_Contains(L.geom, M.geom)
"""
low_med_overlap = query(conn, sql)
low_med_overlap.shape
(320, 6)
Let's check if medium density hotspots can exist within a high density hotspot.
sql = """
WITH medcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'Medium Density'
), highcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'High Density'
)
SELECT M.id AS id_med,
M.geom AS geom_med,
M.area AS area_med,
H.id AS id_high,
H.geom AS geom_high,
H.area AS area_high
FROM medcrime M JOIN highcrime H ON ST_Contains(H.geom, M.geom)
"""
high_med_overlap = query(conn, sql)
high_med_overlap.head()
| id_med | geom_med | area_med | id_high | geom_high | area_high |
|---|
The above output has proven high density hotspots cannot contain any other hotspots.
sql = """
WITH lowcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'Low Density'
), highcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'High Density'
)
SELECT L.id AS id_low,
L.geom AS geom_low,
L.area AS area_low,
H.id AS id_high,
H.geom AS geom_high,
H.area AS area_high
FROM lowcrime L JOIN highcrime H ON ST_Contains(L.geom, H.geom)
"""
low_high_overlap = query(conn, sql)
low_high_overlap.shape
(188, 6)
The above output confirms that high densities exist within low density hotspots. However, for the purposes of calculating density area, it would be more useful to investigate if medium densities contain high densities.
sql = """
WITH medcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'Medium Density'
), highcrime AS (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'High Density'
)
SELECT M.id AS id_med,
M.geom AS geom_med,
M.area AS area_med,
H.id AS id_high,
H.geom AS geom_high,
H.area AS area_high
FROM medcrime M JOIN highcrime H ON ST_Contains(M.geom, H.geom)
"""
med_high_overlap = query(conn, sql)
med_high_overlap.shape
(188, 6)
The above output shows that there are 188 unique instances of high densities existing within medium density hotspots. Interestingly there are 188 unique instances of high densities within low densities as well. This could mean that all the high density hotspots within low density hotspots are contained within some medium density hotspot. If that is the case, then the calculation of low density areas does not need to consider the areas of high density hotspots contained within low density ones, as the areas of such high density hotspots will be apart of the medium density hotspots within low density hotspots.
Let's check this.
# storing id's of high density contours within medium density contours
high_in_med_id = med_high_overlap["id_high"].tolist()
# storing id's of high density contours within low density contours
high_in_low_id = low_high_overlap["id_high"].to_list()
# checking if any high densities within low densities do not exist inside of a medium density
differ = []
for low_id in high_in_low_id:
if low_id not in high_in_med_id:
differ.append(low_id)
print(low_id)
344
The above output confirms that one high density hotspot of id_high = 344 is inside of a low density hotspot but not in a medium density hotspot. Conversely, there exists a low density hotspot with a high density hotspot but no medium density hotspot. We should note this outlier.
low_with_one_high_id = low_high_overlap[low_high_overlap["id_high"] == differ[0]]["id_low"].to_list()[0]
high_area_for_low = low_high_overlap[low_high_overlap["id_high"] == differ[0]]["area_high"].to_list()[0]
Finding the area of medium density hotspots within lower density hotspots (stored in low_minus_area dictionary).
Note: code below uses med_in_low table from earlier which contains all medium density hotspots within low density hotspots.
outlier_low
[313, 291]
low_minus_area = {}
i = 0
while i < len(low_med_overlap):
low_object_id = low_med_overlap.loc[i]["id_low"]
med_area = low_med_overlap.loc[i]["area_med"]
# if the low density hotspot is one of the 'outlier' densities from earlier
# then it shares its area with its corresponding 'containing' medium density
if low_object_id in outlier_low:
low_minus_area[low_object_id] = [0]
elif low_object_id not in low_minus_area:
low_minus_area[low_object_id] = [med_area]
else:
low_minus_area[low_object_id].append(med_area)
i += 1
Recall there existing a single high density present within a low density. Such area of high density needs to be added to low_minus_area and since high densities do not contain any other hotspots, their areas do not require any calculations.
if low_with_one_high_id in low_minus_area:
low_minus_area[low_with_one_high_id].append(high_area_for_low)
else:
low_minus_area[low_with_one_high] = [high_area_for_low]
for density_id in low_minus_area.keys():
low_minus_area[density_id] = sum(low_minus_area[density_id])
sql = """
SELECT * FROM (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'Low Density'
) lowcrime
"""
low_density_table = query(conn, sql)
i = 0
low_area_new = []
while i < len(low_density_table):
low_object_id = low_density_table.loc[i]["id"]
low_area = low_density_table.loc[i]["area"]
if low_object_id in low_minus_area.keys():
area = low_area - low_minus_area[low_object_id]
low_area_new.append(area)
# this else condition will run for low densities which do not contain any density hotspots
# hence do not need any subtract as their area does not contain any other area
else:
low_area_new.append(low_area)
i += 1
low_density_table["low_area"] = low_area_new
low_density_areas = low_density_table[["id", "area"]]
low_density_areas.head(3)
| id | area | |
|---|---|---|
| 0 | 1 | 0.000015 |
| 1 | 2 | 0.000006 |
| 2 | 3 | 0.000003 |
The areas of the various density hotspots needs to be integerated into a dataset/table that associates each density areas with a suburb in Greater Sydney. Since such a table has already been created(crime_sydney), a collection to contain the density areas needs to be created for data integration. Since each type of density area is being calculated separately a collection is needed for each type of density:
low_density_areas_dict = {}
i = 0
while i < len(low_density_areas):
low_object_id = low_density_table.loc[i]["id"]
low_area = low_density_table.loc[i]["area"]
low_density_areas_dict[low_object_id] = low_area
i += 1
# checking for any negative values (checking for potential flaw in code/ data)
(low_density_areas < 0).values.any()
False
Next in order to find the areas of medium density hotspots, the areas of the high density hotspots need to be found first. Since, high density hotspots do not contain any other hotspots, their areas do not need calculation/alteration.
sql = """
SELECT * FROM (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'High Density'
) highcrime;
"""
high_density_table = query(conn, sql)
high_density_areas = high_density_table[["id", "area"]]
high_density_areas.head(3)
| id | area | |
|---|---|---|
| 0 | 1 | 2.911138e-05 |
| 1 | 2 | 2.544203e-05 |
| 2 | 3 | 1.932130e-07 |
Similar to the low densities,the high density hotspot areas need to be added to a new collection (high_density_areas_dict) (note a weighting of '2' is given to high density hotspots in an attempt to differentiate the severity of low, medium and high density hotspots):
i = 0
high_density_areas_dict = {}
while i < len(high_density_areas):
high_object_id = high_density_areas.loc[i]["id"]
high_area = high_density_areas.loc[i]["area"]
high_density_areas_dict[high_object_id] = 2 * high_area
i += 1
Similar to low density hotspots, medium intensity hotspots have the possibility of containing more than one high density hotspot. Hence the total area of such high density hotspots needs to be calculated (except for the outlier case).
Below is the code that finds the total area of such inner high density hotspots (stored in med_minus_area dictionary).
med_minus_area = {}
i = 0
while i < len(med_high_overlap):
med_object_id = med_high_overlap.loc[i]["id_med"]
high_area = med_high_overlap.loc[i]["area_high"]
if med_object_id not in med_minus_area:
med_minus_area[med_object_id] = [high_area]
else:
med_minus_area[med_object_id].append(high_area)
i += 1
for density_id in med_minus_area.keys():
med_minus_area[density_id] = sum(med_minus_area[density_id])
sql = """
SELECT * FROM (
SELECT ROW_NUMBER() OVER (ORDER BY C.id ASC) AS id,
ST_Intersection(GS.geom, C.geom) AS geom,
ST_Area(ST_Intersection(GS.geom, C.geom)) AS area
FROM greatersydney GS JOIN crime C ON ST_Intersects(GS.geom, C.geom)
WHERE C.density = 'Medium Density'
) highcrime;
"""
med_density_table = query(conn, sql)
i = 0
med_area_new = []
while i < len(med_density_table):
med_object_id = med_density_table.loc[i]["id"]
med_area = med_density_table.loc[i]["area"]
if med_object_id in med_minus_area.keys():
area = med_area - med_minus_area[med_object_id]
med_area_new.append(area)
else:
med_area_new.append(med_area)
i += 1
med_density_table["med_area"] = med_area_new
med_density_areas = med_density_table[["id", "area"]]
med_density_areas.head(3)
| id | area | |
|---|---|---|
| 0 | 1 | 0.000005 |
| 1 | 2 | 0.000010 |
| 2 | 3 | 0.000002 |
Adding the medium density hotspot areas to med_density_areas_dict (note that a weighting of '1.5' is given to the medium density):
i = 0
med_density_areas_dict = {}
while i < len(med_density_areas):
med_object_id = med_density_areas.loc[i]["id"]
med_shape_area = med_density_areas.loc[i]["area"]
med_density_areas_dict[med_object_id] = 1.5 * med_area
i += 1
Let's check if our density collections have stored all our data.
len(low_density_areas_dict) + len(med_density_areas_dict) + len(high_density_areas_dict)
1902
Perfect.
crime_sydney with density areas¶crime_sydney.head(3)
| id | sa2_code | sa2_name | areasqkm | density | geom | area | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 123021436 | Bradbury - Wedderburn | 36.6555 | High Density | POLYGON ((150.82328 -34.08944, 150.82330 -34.0... | 2.911138e-05 |
| 1 | 2 | 123021444 | Rosemeadow - Glen Alpine | 48.0827 | High Density | POLYGON ((150.79940 -34.08926, 150.79942 -34.0... | 2.544203e-05 |
| 2 | 3 | 123021436 | Bradbury - Wedderburn | 36.6555 | High Density | POLYGON ((150.80612 -34.08702, 150.80621 -34.0... | 1.932130e-07 |
Replacing existing area column, which contains the row's corresponding density's area, with the actual areas of the densities stored in density_areas_dict:
i = 0
area_new = []
while i < len(high_density_areas_dict):
object_id = crime_sydney.loc[i]["id"]
#st_area = crime_sydney.loc[i]["area"]
area = high_density_areas_dict[object_id]
area_new.append(area)
i += 1
while i < (len(low_density_areas_dict) + len(high_density_areas_dict)) :
object_id = crime_sydney.loc[i]["id"]
#st_area = crime_sydney.loc[i]["area"]
area = low_density_areas_dict[object_id]
area_new.append(area)
i += 1
while i < (len(low_density_areas_dict) + len(high_density_areas_dict) + len(med_density_areas_dict)):
object_id = crime_sydney.loc[i]["id"]
#st_area = crime_sydney.loc[i]["area"]
area = med_density_areas_dict[object_id]
area_new.append(area)
i += 1
crime_sydney["area"] = area_new
crime_sydney.head(3)
| id | sa2_code | sa2_name | areasqkm | density | geom | area | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 123021436 | Bradbury - Wedderburn | 36.6555 | High Density | POLYGON ((150.82328 -34.08944, 150.82330 -34.0... | 5.822276e-05 |
| 1 | 2 | 123021444 | Rosemeadow - Glen Alpine | 48.0827 | High Density | POLYGON ((150.79940 -34.08926, 150.79942 -34.0... | 5.088406e-05 |
| 2 | 3 | 123021436 | Bradbury - Wedderburn | 36.6555 | High Density | POLYGON ((150.80612 -34.08702, 150.80621 -34.0... | 3.864260e-07 |
accom, retail and health¶The following SQL query creates a new view (rather than a table, to save database resources) which stores the z-scores for the above three categories. It uses 2 WITH clauses to organise the query: perthousand calculates the "per 1000 people" part of the query and stats calculates the mean and standard deviation of each column. Finally, the main query calculates the z-scores.
sql = """
CREATE OR REPLACE VIEW neighbourhood_z AS (
WITH stats AS (
WITH perthousand AS (
SELECT sa2_code,
sa2_name,
accom_food/(population/1000) AS accom_1000,
retail/(population/1000) AS retail_1000,
health/(population/1000) AS health_1000
FROM neighbourhoods
)
SELECT sa2_code,
sa2_name,
accom_1000,
retail_1000,
health_1000,
(SELECT AVG(accom_1000) AS accom_avg FROM perthousand),
(SELECT STDDEV(accom_1000) AS accom_std FROM perthousand),
(SELECT AVG(retail_1000) AS retail_avg FROM perthousand),
(SELECT STDDEV(retail_1000) AS retail_std FROM perthousand),
(SELECT AVG(health_1000) AS health_avg FROM perthousand),
(SELECT STDDEV(health_1000) AS health_std FROM perthousand)
FROM perthousand
)
SELECT sa2_code,
sa2_name,
(accom_1000 - accom_avg)/accom_std AS accom_z,
(retail_1000 - retail_avg)/retail_std AS retail_z,
(health_1000 - health_avg)/health_std AS health_z
FROM stats
);
SELECT * FROM neighbourhood_z;
"""
neighbourhood_z = query(conn, sql)
neighbourhood_z.head(1)
| sa2_code | sa2_name | accom_z | retail_z | health_z | |
|---|---|---|---|---|---|
| 0 | 102011028 | Avoca Beach - Copacabana | 0.130115 | -0.277592 | 0.330901 |
school¶The following SQL query aggregates the number of school catchment areas in each SA2 region. The query makes use of the ST_Intersects function which checks if two (multi)polygons intersect. By definition from the documentation: "If a geometry or geography shares any portion of space then they intersect." It returns the count of school catchment areas as well as the number of young people in the SA2 neighbourhood.
Note that this time we use Python to calculate the z-scores rather than SQL (as we did above).
sql = """
CREATE OR REPLACE VIEW sa2schoolcounts AS (
SELECT GS.sa2_code,
GS.sa2_name,
GS.geom,
COUNT(S.name) AS num_school_catch,
N.young_pop
FROM schools S JOIN greatersydney GS ON ST_Intersects(S.geom, GS.geom)
JOIN neighbourhoods N ON (GS.sa2_code = N.sa2_code AND
GS.sa2_name = N.sa2_name)
GROUP BY GS.sa2_code, GS.sa2_name, GS.geom, N.young_pop
);
SELECT * FROM sa2schoolcounts;
"""
sa2schoolcounts = gpd.read_postgis(sql, conn)
Before we go any further, it is important to remember that there are still some "unliveable" neighbourhoods in this dataset for which we have just calculated the number of school catchments for. We should change these to NA before calculating the z-scores.
for neighbourhood in empty_neighbourhoods:
sa2schoolcounts.loc[sa2schoolcounts["sa2_name"] == neighbourhood, "num_school_catch"] = np.nan
# calculating "number of school catchments per 1000 young people"
sa2schoolcounts["school_catch_1000"] = sa2schoolcounts["num_school_catch"] / (sa2schoolcounts["young_pop"]/1000)
# calculating the z-score of "number of school catchments per 1000 young people"
sa2schoolcounts["school_z"] = (sa2schoolcounts["school_catch_1000"] - sa2schoolcounts["school_catch_1000"].mean()) / sa2schoolcounts["school_catch_1000"].std()
school_z = sa2schoolcounts.copy()[["sa2_code", "sa2_name", "geom", "school_z"]]
school_z.head(1)
| sa2_code | sa2_name | geom | school_z | |
|---|---|---|---|---|
| 0 | 102011028 | Avoca Beach - Copacabana | MULTIPOLYGON (((151.41373 -33.46559, 151.41361... | -0.296441 |
crime¶Calculating z-scores for crime_sydney:
i = 0
sa2_density_areas = {}
sa2_region_areas = {}
while i < len(crime_sydney):
sa2_region = crime_sydney.loc[i]["sa2_name"]
sa2_region_area = crime_sydney.loc[i]["areasqkm"]
area = crime_sydney.loc[i]["area"]
if sa2_region not in sa2_density_areas:
sa2_density_areas[sa2_region] = [area]
else:
sa2_density_areas[sa2_region].append(area)
if sa2_region not in sa2_region_areas:
sa2_region_areas[sa2_region] = sa2_region_area
i += 1
for density_area in sa2_density_areas.keys():
sa2_density_areas[density_area] = sum(sa2_density_areas[density_area])/ sa2_region_areas[density_area]
crime = pd.DataFrame.from_dict(sa2_density_areas, orient = 'index', columns = ['total_density_areas']).reset_index()
crime.rename(columns = {"index": "sa2_name"}, inplace = True)
crime.head(3)
| sa2_name | total_density_areas | |
|---|---|---|
| 0 | Bradbury - Wedderburn | 0.000011 |
| 1 | Rosemeadow - Glen Alpine | 0.000006 |
| 2 | Leumeah - Minto Heights | 0.000016 |
crime["crime_z"] = (crime["total_density_areas"] - crime["total_density_areas"].mean()) / crime["total_density_areas"].std()
crime.drop(columns = "total_density_areas", inplace = True)
crime.head(3)
| sa2_name | crime_z | |
|---|---|---|
| 0 | Bradbury - Wedderburn | -0.652589 |
| 1 | Rosemeadow - Glen Alpine | -0.763217 |
| 2 | Leumeah - Minto Heights | -0.522813 |
Similar to our schools dataset, this dataset still contains data for neighbourhoods deemed "unliveable." Let's empty these entries.
for neighbourhood in empty_neighbourhoods:
crime.loc[crime["sa2_name"] == neighbourhood, "crime_z"] = np.nan
First we merge school_z with accom_z, retail_z, and health_z.
all_z = school_z.merge(neighbourhood_z)[["sa2_code", "sa2_name", "geom", "school_z", "accom_z", "retail_z", "health_z"]]
all_z.shape
(312, 7)
Then we merge this dataframe with crime_z.
# using left merge as crime dataset doesn't contain data for some neighbourhoods
all_z = all_z.merge(crime, how = "left")
all_z.head(3)
| sa2_code | sa2_name | geom | school_z | accom_z | retail_z | health_z | crime_z | |
|---|---|---|---|---|---|---|---|---|
| 0 | 102011028 | Avoca Beach - Copacabana | MULTIPOLYGON (((151.41373 -33.46559, 151.41361... | -0.296441 | 0.130115 | -0.277592 | 0.330901 | NaN |
| 1 | 102011029 | Box Head - MacMasters Beach | MULTIPOLYGON (((151.35398 -33.49854, 151.35397... | -0.196190 | -0.474578 | -0.386652 | -0.493699 | -0.866643 |
| 2 | 102011030 | Calga - Kulnura | MULTIPOLYGON (((151.20460 -33.53298, 151.20456... | 6.431615 | -0.260413 | 0.626528 | -0.790237 | NaN |
Compute the 'liveability' score for all given neighbourhoods according to the following formula:
$$ \text{Score} = \mathcal{S} (z_\text{school} + z_\text{accomm} + z_\text{retail} - z_\text{crime} + z_\text{health}) $$Note that $\mathcal{S}$ here stands for the sigmoid function:
A sigmoid function is a mathematical function having a characteristic "S"-shaped curve or sigmoid curve. A common example of a sigmoid function is the logistic function shown in the first figure and defined by the formula:
$$S(x) = \frac{1}{1+e^{-x}}$$_Taken from Wikipedia_
Essentially the sigmoid function ensures all our scores are between 0 and 1. Let's apply this transformation now.
# summing z scores based on whether they are a positive or negative influence
all_z["sum_z"] = all_z["school_z"] + all_z["accom_z"] + all_z["retail_z"] - all_z["crime_z"] + all_z["health_z"]
# applying sigmoid function
all_z["score"] = 1 / (1 + np.exp(-all_z["sum_z"]))
all_z.drop(columns = "sum_z", inplace = True)
bottom_10 = all_z.drop(columns = ["sa2_code", "geom"]).sort_values(by = "score").head(10)
top_10 = all_z.drop(columns = ["sa2_code", "geom"]).sort_values(by = "score", ascending = False).head(10)
bottom_10
| sa2_name | school_z | accom_z | retail_z | health_z | crime_z | score | |
|---|---|---|---|---|---|---|---|
| 292 | Lurnea - Cartwright | -0.138907 | -0.970797 | -0.669064 | -0.949926 | 2.031976 | 0.008487 |
| 266 | Ashcroft - Busby - Miller | -0.224581 | -0.647061 | -0.958476 | -0.829780 | 1.945898 | 0.009895 |
| 59 | Bidwill - Hebersham - Emerton | -0.268296 | -0.795425 | -0.907292 | -1.029701 | 1.599614 | 0.009949 |
| 216 | Jamisontown - South Penrith | -0.611439 | -0.595825 | -0.151342 | -0.811183 | 2.099183 | 0.013803 |
| 226 | Colyton - Oxley Park | -0.207460 | -0.769546 | -0.955656 | -1.011461 | 1.259610 | 0.014720 |
| 53 | Blacktown (West) | -0.525910 | -0.350772 | -0.649963 | -0.921800 | 1.283438 | 0.023388 |
| 241 | Granville - Clyde | -0.249790 | -0.026734 | 0.263569 | -0.663730 | 3.005945 | 0.024539 |
| 240 | Fairfield - East | -0.234078 | -0.589480 | 0.051823 | -0.894622 | 1.927641 | 0.026753 |
| 238 | Oatlands - Dundas Valley | -0.273596 | -0.279584 | -0.082160 | -0.400782 | 2.455798 | 0.029543 |
| 62 | Lethbridge Park - Tregear | -0.332054 | -0.816666 | -0.984275 | -1.105144 | 0.160986 | 0.032323 |
top_10
| sa2_name | school_z | accom_z | retail_z | health_z | crime_z | score | |
|---|---|---|---|---|---|---|---|
| 83 | Sydney - Haymarket - The Rocks | 0.162730 | 13.539562 | 10.357695 | 6.806156 | 2.040382 | 1.000000 |
| 172 | North Sydney - Lavender Bay | -0.349317 | 4.002368 | 2.131315 | 3.260226 | 0.635795 | 0.999777 |
| 154 | St Leonards - Naremburn | 0.585478 | 1.646284 | 1.234460 | 3.391675 | 0.105746 | 0.998833 |
| 75 | Darlinghurst | 3.005269 | 3.177448 | 1.304050 | 3.357766 | 4.195972 | 0.998706 |
| 87 | Bondi Junction - Waverly | -0.125770 | 1.541704 | 1.104199 | 3.954839 | 0.761719 | 0.996709 |
| 34 | Castle Hill - Central | 0.592099 | 0.541350 | 2.242515 | 1.024314 | -0.668716 | 0.993751 |
| 43 | Bilpin - Colo - St Albans | 5.359601 | 0.036422 | -0.176466 | -1.077078 | -0.889670 | 0.993518 |
| 27 | Tuggerah - Kangy Angy | 1.055286 | 1.068676 | 1.450235 | 0.318830 | -0.840142 | 0.991278 |
| 31 | Baulkham Hills (West) - Bella Vista | -0.266617 | 0.754972 | 0.851624 | 3.019700 | -0.369298 | 0.991242 |
| 219 | Penrith | 0.180875 | 0.806723 | 1.295028 | 1.726758 | -0.468351 | 0.988768 |
Just from the suburb names we can very clearly see an overrepresentation of south-western and western Sydney suburbs in the bottom 10 list, and inversely an overrepresentation of north-western and eastern suburbs in the top 10 list. The inequality divide across Sydney is truly still alive.
Let's correlate these results with the median rent and median income of each neighbourhood.
all_z = all_z.merge(neighbourhoods)[["sa2_code", "sa2_name", "geom", "school_z", "accom_z", "retail_z", "health_z", "crime_z", "score", "med_ann_house_income", "avg_monthly_rent"]]
all_z.head(3)
| sa2_code | sa2_name | geom | school_z | accom_z | retail_z | health_z | crime_z | score | med_ann_house_income | avg_monthly_rent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 102011028 | Avoca Beach - Copacabana | MULTIPOLYGON (((151.41373 -33.46559, 151.41361... | -0.296441 | 0.130115 | -0.277592 | 0.330901 | NaN | NaN | 46996.0 | 1906.0 |
| 1 | 102011029 | Box Head - MacMasters Beach | MULTIPOLYGON (((151.35398 -33.49854, 151.35397... | -0.196190 | -0.474578 | -0.386652 | -0.493699 | -0.866643 | 0.335263 | 42621.0 | 1682.0 |
| 2 | 102011030 | Calga - Kulnura | MULTIPOLYGON (((151.20460 -33.53298, 151.20456... | 6.431615 | -0.260413 | 0.626528 | -0.790237 | NaN | NaN | 42105.0 | 1182.0 |
fig = px.scatter(all_z,
x = "score",
y = "med_ann_house_income",
hover_data = ["sa2_name"],
title = "Correlation between liveability and median annual household income",
labels = {"med_ann_house_income": "Median annual household income",
"score": "Liveability score"})
fig.show()
fig = px.scatter(all_z,
x = "score",
y = "avg_monthly_rent",
hover_data = ["sa2_name"],
title = "Correlation between liveability and average monthly rent",
labels = {"avg_monthly_rent": "Average monthly rent",
"score": "Liveability score"})
fig.show()
Interestingly enough, the correlation between our calculated liveability score and the financial data is not so strong overall. There is a very weak positive correlation between these two variables.
Amazingly, it seems the liveability score is seemed to be influenced more by region rather than stats — as in, if you hover over the lower left data points in either of the above graphs, these are all south-western or western sydney suburbs.
Let's graph this on a map to see if this is true.
all_z_plot = all_z.copy().set_index("sa2_code")
all_z_plot.rename(columns = {"school_z": "School Catchment Areas",
"accom_z": "Accommodation and Food Services",
"retail_z": "Retail Services",
"health_z": "Health Services",
"crime_z": "Crime Hotspot Areas",
"score": "Liveability Score"}, inplace = True)
all_z_plot.fillna(-999, inplace = True)
import plotly.express as px
from jupyter_dash import JupyterDash
from dash import Dash, dcc, html, Input, Output
app = JupyterDash(__name__)
app.layout = html.Div([
html.H2("Greater Sydney Liveability Analysis"),
html.P("Select a variable on which to analyse liveability:"),
dcc.RadioItems(
id = "variable",
options = ["School Catchment Areas",
"Accommodation and Food Services",
"Retail Services",
"Health Services",
"Crime Hotspot Areas",
"Liveability Score"],
value = "School Catchment Areas",
inline = True
),
dcc.Graph(id = "graph"),
html.H4("Opacity slider"),
dcc.Slider(min = 0,
max = 1,
step = 0.1,
value = 0.8,
id = "opacity",
marks = None,
tooltip = {"placement": "bottom", "always_visible": True}),
html.P("""Note: greyed out areas have been deemed
'unliveable' for the purposes of this assignment.
The recorded value (-999) is just a placeholder
and is not of any significance.""")
])
@app.callback(
Output("graph", "figure"),
Input("variable", "value"),
Input("opacity", "value"))
def display_choropleth(variable, opacity):
max_z = all_z_plot[variable].max()
if variable != "Liveability Score":
min_z = -1
else:
min_z = 0
fig = px.choropleth_mapbox(
all_z_plot,
geojson = all_z_plot.geometry,
color = variable,
range_color = [min_z, max_z],
color_continuous_scale = [
[0, 'gray'],
[0.0000000000001, 'red'],
[1, 'blue']
],
locations = all_z_plot.index,
labels = {"sa2_code": "SA2 code",
"School Catchment Areas": "School Catchments (z)",
"Accommodation and Food Services": "Acc & Food (z)",
"Retail Services": "Retail (z)",
"Health Services": "Health (z)",
"Crime Hotspot Areas": "Crime (z)"
},
hover_name = "sa2_name",
mapbox_style = "open-street-map",
center = {"lat": -33.8148, "lon": 151.0017}, # centered on Parramatta
zoom = 7,
opacity = opacity
)
fig.update_geos(fitbounds = "locations", visible = False)
fig.update_layout(margin = {"t": 20, "r": 0, "b": 0, "l": 0})
return fig
app.run_server(mode = "inline")
Wallee is a single 31-year-old male, who works as a data scientist in Sydney at the company ‘Canva’. He enjoys spending time walking and appreciating nature and wants the ability to walk or cycle and minimise his use of cars and public transport to minimise his carbon footprint. Hence why he is looking to move to the City of Sydney close by to his office, with lots of trees. Furthermore, among his friends, Wallee is known for his determination to protect the environment as he takes part in several voluntary initiatives which strive to address the impacts of climate change and massive energy usage on nature. Hence why he deems it a conflict of interest to live in a neighbourhood with large amounts of energy usage and green house emissions. He’s also lowkey hipster and prefers thrifting over buying new stuff from retail stores. His favourite past time at the moment is enjoying his Saturday morning coffees underneath the canopy of jacarandas growing in his front yard. As per the assignment:
Only perform analysis on SA2’s which fall under the City of Sydney (note, this should be SA2’s and suburbs where the SA3 is ‘Sydney Inner City’.)
Hence you will often see in the SQL queries below that we include the condition WHERE sa3_name = 'Sydney Inner City'.
electricity¶Note a key difference between the SA2 dataset provided by the Australian Bureau of Statistics (ABS) and the electricity dataset provided by the City of Sydney (CoS): suburb definitions are different. See the two maps produced below.
def createMapbox(df, owner):
fig = px.choropleth_mapbox(
df,
geojson = df.geom,
locations = df.index,
title = f"{owner} definitions of City of Sydney neighbourhoods",
labels = {"sa2_name": "suburb"},
mapbox_style = "open-street-map",
center = {"lat": -33.883, "lon": 151.206}, # centered on Redfern
zoom = 11,
opacity = 0.8
)
fig.update_geos(fitbounds = "locations", visible = False)
fig.update_layout(margin = {"t": 70, "r": 0, "b": 10, "l": 0})
return fig
ABS_CityOfSydney = gpd.read_postgis("SELECT * FROM greatersydney WHERE sa3_name = 'Sydney Inner City'", conn)
ABS_CityOfSydney_plot = ABS_CityOfSydney.set_index("sa2_name")
fig_ABS = createMapbox(ABS_CityOfSydney_plot, "ABS")
electricity = gpd.read_postgis("SELECT * FROM electricity", conn)
electricity_plot = electricity.set_index("suburb")
fig_COS = createMapbox(electricity_plot, "CoS")
fig_ABS.show()
fig_COS.show()
We can see from comparing the above two maps that:
Now we must try and see CoS suburbs fall under which ABS suburb definitions.
sql = """
SELECT GS.sa2_name, E.suburb AS cos_suburb
FROM greatersydney GS JOIN electricity E ON ST_Within(ST_Transform(E.geom, 4283), GS.geom)
WHERE GS.sa3_name = 'Sydney Inner City'
GROUP BY GS.sa2_name, E.suburb
"""
query(conn, sql)
| sa2_name | cos_suburb |
|---|
From the above query, it appears that there are no CoS geometries completely contained within an ABS geometry. This is terrible news for us — there must be slight overlaps or differences within the geometries that cause the ST function to return false.
sql = """
SELECT GS.sa2_name, E.suburb AS cos_suburb
FROM greatersydney GS JOIN electricity E ON ST_Intersects(ST_Transform(E.geom, 4283), GS.geom)
WHERE GS.sa3_name = 'Sydney Inner City'
GROUP BY GS.sa2_name, E.suburb
"""
query(conn, sql).head(7)
| sa2_name | cos_suburb | |
|---|---|---|
| 0 | Darlinghurst | Darlinghurst |
| 1 | Darlinghurst | Paddington |
| 2 | Darlinghurst | Potts Point |
| 3 | Darlinghurst | Rushcutters Bay |
| 4 | Darlinghurst | Surry Hills |
| 5 | Darlinghurst | Sydney |
| 6 | Darlinghurst | Woolloomooloo |
Indeed, looking at the above table it seems that the geometries do not exactly match up. Darlinghurst from the ABS dataset touches 6 surrounding suburbs in the CoS dataset! This means we will not be able to automate the process of joining these suburb geometries.
However, even manually joining these suburbs would not only be a painstaking activity, but it would simply be inaccurate. For example, if you closely examine the boundaries of Redfern - Chippendale in the ABS dataset and the suburbs of Waterloo + Moore Park and Redfern in the CoS dataset, these actually are very distinctly different. Any sort of aggregation we do of the data based on these incompatible geometries will not be accurate.
We also considered utilising the ST_Intersection function to find what part of which suburbs intersect with each other across the ABS and CoS datasets. We could then apply the ratio of this area compared to the original area and to the electricity usage data and use that as our value. However, we realised this is also very inaccurate as it relies on the assumption that electricity usage is uniform across a suburb. If you consider a suburb such as Waterloo + Moore Park, it is easy to conclude that electricity usage is definitely not equally distributed here — half of it is grass.
Hence we believe this dataset is not viable for this assignment. Let's move onto the next dataset.
trees¶This is a very simple dataset containing the latitude and longitude of every street and park tree in the City of Sydney. We can plot these points on a "scatter plot" using plotly's scatter_mapbox function.
trees = gpd.read_postgis("SELECT * FROM trees", conn)
trees.rename(columns = {"geom": "geometry"}, inplace = True)
fig = px.scatter_mapbox(trees,
lat = trees.geometry.y,
lon = trees.geometry.x,
hover_name = "id",
title = "Street and park trees in the City of Sydney",
mapbox_style = "open-street-map",
center = {"lat": -33.883, "lon": 151.206},
zoom = 11)
fig.update_layout(margin = {"t": 70, "r": 0, "b": 10, "l": 0})
Now let's perform a spatial join to aggregate the number of trees per SA2 neighbourhoods. The query below uses ST_Within to check what SA2 neighbourhood the tree geom is within.
trees_SA2 = gpd.read_postgis("""
SELECT GS.sa2_code, GS.sa2_name, GS.geom, GS.areasqkm, COUNT(T.geom) AS trees
FROM trees T JOIN greatersydney GS ON ST_Within(ST_Transform(T.geom, 4283), GS.geom)
WHERE GS.sa3_name = 'Sydney Inner City'
GROUP BY GS.sa2_code, GS.sa2_name, GS.geom;
""", conn, "geom")
Now let's calculate trees per sqkm, and then finally the z-scores!
# calculating trees per sqkm
trees_SA2["trees_sqkm"] = trees_SA2["trees"]/trees_SA2["areasqkm"]
# calculating the z-score
trees_SA2["trees_z"] = (trees_SA2["trees_sqkm"] - trees_SA2["trees_sqkm"].mean()) / trees_SA2["trees_sqkm"].std()
trees_SA2.head(3)
| sa2_code | sa2_name | geom | areasqkm | trees | trees_sqkm | trees_z | |
|---|---|---|---|---|---|---|---|
| 0 | 117031329 | Darlinghurst | MULTIPOLYGON (((151.21227 -33.87633, 151.21232... | 0.8569 | 2059 | 2402.847473 | 0.735094 |
| 1 | 117031330 | Erskineville - Alexandria | MULTIPOLYGON (((151.18200 -33.90065, 151.18230... | 4.3200 | 8149 | 1886.342593 | -0.100382 |
| 2 | 117031331 | Glebe - Forest Lodge | MULTIPOLYGON (((151.18276 -33.87221, 151.18289... | 2.3018 | 5799 | 2519.332696 | 0.923516 |
trees_SA2_plot = trees_SA2.set_index("sa2_name")
fig = px.choropleth_mapbox(
trees_SA2_plot,
geojson = trees_SA2_plot.geom,
locations = trees_SA2_plot.index,
color = "trees_sqkm",
title = "Number of street and park trees in City of Sydney neighbourhoods per square kilometre",
labels = {"sa2_name": "suburb"},
mapbox_style = "open-street-map",
center = {"lat": -33.883, "lon": 151.206}, # centered on Redfern
zoom = 11,
opacity = 0.7
)
fig.update_geos(fitbounds = "locations", visible = False)
fig.update_layout(margin = {"t": 70, "r": 0, "b": 10, "l": 0})
all_z_cos = all_z.merge(trees_SA2)
all_z_cos.drop(columns = ["score", "med_ann_house_income", "avg_monthly_rent", "areasqkm", "trees", "trees_sqkm"], inplace = True)
all_z_cos.head(3)
| sa2_code | sa2_name | geom | school_z | accom_z | retail_z | health_z | crime_z | trees_z | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 117031329 | Darlinghurst | MULTIPOLYGON (((151.21227 -33.87633, 151.21232... | 3.005269 | 3.177448 | 1.304050 | 3.357766 | 4.195972 | 0.735094 |
| 1 | 117031330 | Erskineville - Alexandria | MULTIPOLYGON (((151.18200 -33.90065, 151.18230... | -0.037416 | 1.055970 | 1.846312 | -0.003969 | 0.645173 | -0.100382 |
| 2 | 117031331 | Glebe - Forest Lodge | MULTIPOLYGON (((151.18276 -33.87221, 151.18289... | -0.227354 | 0.467058 | -0.116496 | 0.429578 | 3.033614 | 0.923516 |
school_z 0.1.accom_z a 0.5.retail_z a 0.5.health_z a 0.8.crime_z a 3.trees_z a 4. Trees overrules all other variables.all_z_cos["sum"] = 0.1 * all_z_cos["school_z"] + 0.5 * all_z_cos["accom_z"] + 0.5 * all_z_cos["retail_z"] + 0.8 * all_z_cos["health_z"] - 3 * all_z_cos["crime_z"] + 4 * all_z_cos["trees_z"]
all_z_cos["score"] = 1 / (1 + np.exp(-all_z_cos["sum"]))
all_z_cos.drop(columns = "sum", inplace = True)
all_z_cos[["sa2_name", "score"]].sort_values(by = "score")
| sa2_name | score | |
|---|---|---|
| 7 | Surry Hills | 0.000019 |
| 5 | Pyrmont - Ultimo | 0.000074 |
| 6 | Redfern - Chippendale | 0.000283 |
| 4 | Potts Point - Woolloomooloo | 0.000668 |
| 3 | Newtown - Camperdown - Darlington | 0.002994 |
| 2 | Glebe - Forest Lodge | 0.007314 |
| 0 | Darlinghurst | 0.011891 |
| 1 | Erskineville - Alexandria | 0.290527 |
| 9 | Waterloo - Beaconsfield | 0.709076 |
| 8 | Sydney - Haymarket - The Rocks | 0.987333 |
all_z_cos_plot = all_z_cos.set_index("sa2_name")
fig = px.choropleth_mapbox(
all_z_cos_plot,
geojson = all_z_cos_plot.geom,
locations = all_z_cos_plot.index,
color = "score",
title = "Liveability score tailored to our stakeholder for City of Sydney",
labels = {"sa2_name": "suburb"},
mapbox_style = "open-street-map",
center = {"lat": -33.883, "lon": 151.206}, # centered on Redfern
zoom = 11,
opacity = 0.7
)
fig.update_geos(fitbounds = "locations", visible = False)
fig.update_layout(margin = {"t": 70, "r": 0, "b": 10, "l": 0})
Wallee, with his \$500k+ income, should invest in an apartment in Sydney - Haymarket - The Rocks. If he does not wish to throw away all his life savings, perhaps Waterloo - Beaconsfield is better. I've heard Green Square and Zetland are popping off these days ;)
conn.close()